Marion Coutarel
Projet 6
Juillet 2021

#colors
#shared palette reused by every chart in this notebook
A1, A2, A3, A4, A5 = '#9E110B', '#EB971D', '#EB3C5A', '#0EAEE8', '#527B9F'
%%html
<style>
/* Notebook-wide styling; colors match the A1-A5 palette defined above. */
p{font-size : 1.5em}
b{color:#9E110B}
a{color:#EB971D}
h1{color : #9E110B; font-family: cursive, Verdana, sans-serif; text-shadow:1px 1px grey}
h2{font-family: cursive, Verdana, sans-serif; text-shadow:1px 1px grey;}
h3{color: #EB971D; text-shadow:1px 1px #e2e2e2}
h4{color: #527B9F}
</style>
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('seaborn-colorblind')
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
pd.options.plotting.backend = "plotly"
from datetime import datetime
def percConvert(ser):
    """Convert a row of counts to percentages of its last entry.

    Parameters
    ----------
    ser : pandas.Series
        A row whose last value holds the row total (e.g. a 'Tot' column).

    Returns
    -------
    pandas.Series
        Each value as a percentage of the total, rounded to 2 decimals.
    """
    # iloc[-1] is explicitly positional: the original ser[-1] relied on the
    # deprecated integer-label fallback and raises KeyError on an
    # integer-indexed Series.
    return round(100 * ser / float(ser.iloc[-1]), 2)
#load the customers table: one row per client (client_id, sex, birth year)
customers=pd.read_csv('data/customers.csv')
customers.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 8623 entries, 0 to 8622 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 client_id 8623 non-null object 1 sex 8623 non-null object 2 birth 8623 non-null int64 dtypes: int64(1), object(2) memory usage: 202.2+ KB
#check null: share of missing values per column
for col in customers.columns:
    missing_ratio = customers[col].isnull().mean()
    print('{} - {}%'.format(col, round(missing_ratio * 100)))
client_id - 0% sex - 0% birth - 0%
#check unique: ratio of distinct values per column
for col in customers.columns:
    distinct_ratio = customers[col].nunique() / len(customers[col])
    print('{} - {}%'.format(col, round(distinct_ratio * 100)))
client_id - 100% sex - 0% birth - 1%
#check there are only 2 gender labels ('f' and 'm')
customers.sex.unique()
array(['f', 'm'], dtype=object)
#Change birth column type to datetime
#NOTE: format="%Y" maps every birth year to January 1st of that year
customers.birth=pd.to_datetime(customers.birth,format="%Y")
#oldest customer
print(customers.birth.min())
#youngest customer
print(customers.birth.max())
1929-01-01 00:00:00 2004-01-01 00:00:00
Customers born in 2004 seem to be over-represented.
#distribution of birth dates (bar chart via the plotly plotting backend)
customers.birth.plot.bar()
#birth was converted to datetime above, so query("birth==2004") would compare
#datetime64 values with an int and never match; compare on the year instead
len(customers[customers.birth.dt.year == 2004])
440
#load the product catalogue: one row per product (id_prod, price, categ)
products=pd.read_csv('data/products.csv')
products.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3287 entries, 0 to 3286 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id_prod 3287 non-null object 1 price 3287 non-null float64 2 categ 3287 non-null int64 dtypes: float64(1), int64(1), object(1) memory usage: 77.2+ KB
#check id are unique (1.0 means every id_prod appears exactly once)
products.id_prod.nunique()/len(products)
1.0
sns.heatmap(products[products.columns].isnull(), cmap=sns.color_palette([A5,A3]))
<AxesSubplot:>
#seems to have some wrong prices
#a negative price (-1.0) flags the test product investigated later
products.price.min()
-1.0
products.price.max()
300.0
#three product categories exist: 0, 1 and 2
products.categ.unique()
array([0, 1, 2], dtype=int64)
#categ is a label, not a quantity: store it as string for grouping/plotting
products.categ=products.categ.astype(str)
#load the transactions table: one row per product sold in a session
transactions= pd.read_csv('data/transactions.csv')
transactions.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 679532 entries, 0 to 679531 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id_prod 679532 non-null object 1 date 679532 non-null object 2 session_id 679532 non-null object 3 client_id 679532 non-null object dtypes: object(4) memory usage: 20.7+ MB
transactions.head()
| id_prod | date | session_id | client_id | |
|---|---|---|---|---|
| 0 | 0_1518 | 2022-05-20 13:21:29.043970 | s_211425 | c_103 |
| 1 | 1_251 | 2022-02-02 07:55:19.149409 | s_158752 | c_8534 |
| 2 | 0_1277 | 2022-06-18 15:44:33.155329 | s_225667 | c_6714 |
| 3 | 2_209 | 2021-06-24 04:19:29.835891 | s_52962 | c_6941 |
| 4 | 0_1509 | 2023-01-11 08:22:08.194479 | s_325227 | c_4232 |
I erase the time component (hours, minutes and seconds) from the date, keeping only the calendar day.
#keep only the calendar-date part of the timestamp string, drop the raw column
transactions['Date'] = transactions.date.str.split(' ').str[0]
transactions.drop('date', axis=1, inplace=True)
We have 194 rows whose date begins with "test". They all concern a highly suspicious product sold at -1€ and what seem to be the first 2 customers of the customers file. I assume those are just test runs and drop the rows.
#all rows dated 'test_2021-03-01' concern the same dummy product...
transactions.query("Date =='test_2021-03-01'").id_prod.unique()
array(['T_0'], dtype=object)
#...and only two test client ids
transactions.query("Date =='test_2021-03-01'").client_id.unique()
array(['ct_0', 'ct_1'], dtype=object)
#the dummy product is the one priced -1.0 spotted earlier
products.query("id_prod=='T_0'")
| id_prod | price | categ | |
|---|---|---|---|
| 731 | T_0 | -1.0 | 0 |
#drop the test rows from the transactions table
transactions=transactions.query("Date!='test_2021-03-01'")
I convert Date to a datetime format that is easier to work with.
#convert the Date string to datetime64 for resampling/grouping later on
transactions.Date=pd.to_datetime(transactions.Date)
transactions
| id_prod | session_id | client_id | Date | |
|---|---|---|---|---|
| 0 | 0_1518 | s_211425 | c_103 | 2022-05-20 |
| 1 | 1_251 | s_158752 | c_8534 | 2022-02-02 |
| 2 | 0_1277 | s_225667 | c_6714 | 2022-06-18 |
| 3 | 2_209 | s_52962 | c_6941 | 2021-06-24 |
| 4 | 0_1509 | s_325227 | c_4232 | 2023-01-11 |
| ... | ... | ... | ... | ... |
| 679527 | 0_1551 | s_150195 | c_8489 | 2022-01-15 |
| 679528 | 1_639 | s_181434 | c_4370 | 2022-03-19 |
| 679529 | 0_1425 | s_314704 | c_304 | 2022-12-20 |
| 679530 | 0_1994 | s_63204 | c_2227 | 2021-07-16 |
| 679531 | 1_523 | s_274568 | c_3873 | 2022-09-28 |
679332 rows × 4 columns
22 product references are not in the transactions df.
#products never sold: present in the catalogue but absent from transactions
Check= list(set(products.id_prod.unique()).difference(transactions.id_prod.unique()))
Check
['0_1645', '0_310', '0_1016', '0_1025', '0_1620', '1_0', '0_1800', '0_2308', '0_1624', '2_86', '0_1014', '0_322', '0_1318', '0_1780', '2_72', '0_1062', 'T_0', '2_87', '0_299', '0_1119', '1_394', '0_510']
1 reference is in transactions but not in the products list. It concerns 221 transactions. Those sales won't be analysed
#the opposite check: sold ids missing from the product catalogue
Check=list(set(transactions.id_prod.unique()).difference(products.id_prod.unique()))
Check
['0_2245']
#0_2245 has no price/categ info: its rows are lost on the inner merge below
transactions.query('id_prod=="0_2245"')
| id_prod | session_id | client_id | Date | |
|---|---|---|---|---|
| 2633 | 0_2245 | s_272266 | c_4746 | 2022-09-23 |
| 10106 | 0_2245 | s_242482 | c_6713 | 2022-07-23 |
| 11727 | 0_2245 | s_306338 | c_5108 | 2022-12-03 |
| 15675 | 0_2245 | s_76493 | c_1391 | 2021-08-16 |
| 16377 | 0_2245 | s_239078 | c_7954 | 2022-07-16 |
| ... | ... | ... | ... | ... |
| 669730 | 0_2245 | s_80395 | c_131 | 2021-08-25 |
| 670682 | 0_2245 | s_175311 | c_4167 | 2022-03-06 |
| 671286 | 0_2245 | s_209381 | c_4453 | 2022-05-16 |
| 675679 | 0_2245 | s_163405 | c_1098 | 2022-02-11 |
| 677996 | 0_2245 | s_134446 | c_4854 | 2021-12-14 |
221 rows × 4 columns
#inner merge keeps only ids present in both tables
#(drops the 221 rows of 0_2245 and the 22 never-sold products)
df= pd.merge(products, transactions, how='inner', on='id_prod')
df
| id_prod | price | categ | session_id | client_id | Date | |
|---|---|---|---|---|---|---|
| 0 | 0_1421 | 19.99 | 0 | s_168213 | c_6389 | 2022-02-20 |
| 1 | 0_1421 | 19.99 | 0 | s_299590 | c_8364 | 2022-11-19 |
| 2 | 0_1421 | 19.99 | 0 | s_92304 | c_3544 | 2021-09-19 |
| 3 | 0_1421 | 19.99 | 0 | s_325369 | c_1025 | 2023-01-11 |
| 4 | 0_1421 | 19.99 | 0 | s_70071 | c_2298 | 2021-08-01 |
| ... | ... | ... | ... | ... | ... | ... |
| 679106 | 1_140 | 38.56 | 1 | s_231391 | c_974 | 2022-06-30 |
| 679107 | 0_1920 | 25.16 | 0 | s_334324 | c_7748 | 2023-01-30 |
| 679108 | 0_1920 | 25.16 | 0 | s_20115 | c_7088 | 2021-04-13 |
| 679109 | 0_1920 | 25.16 | 0 | s_41465 | c_7748 | 2021-05-30 |
| 679110 | 0_1920 | 25.16 | 0 | s_319303 | c_7748 | 2022-12-30 |
679111 rows × 6 columns
23 of our customers are not in the transactions dataset, but all of those listed in the transactions dataset are in the customers dataset.
#clients appearing in transactions but unknown in customers: none
Check=list(set(transactions.client_id.unique()).difference(customers.client_id.unique()))
Check
[]
#clients registered in customers but with no transaction at all
Check=list(set(customers.client_id.unique()).difference(transactions.client_id.unique()))
len(Check)
23
#left merge keeps every sale and attaches the buyer's sex and birth date
LP=pd.merge( df, customers, how='left', on='client_id')
LP
| id_prod | price | categ | session_id | client_id | Date | sex | birth | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0_1421 | 19.99 | 0 | s_168213 | c_6389 | 2022-02-20 | f | 1991-01-01 |
| 1 | 0_1421 | 19.99 | 0 | s_299590 | c_8364 | 2022-11-19 | m | 1973-01-01 |
| 2 | 0_1421 | 19.99 | 0 | s_92304 | c_3544 | 2021-09-19 | f | 1967-01-01 |
| 3 | 0_1421 | 19.99 | 0 | s_325369 | c_1025 | 2023-01-11 | m | 1984-01-01 |
| 4 | 0_1421 | 19.99 | 0 | s_70071 | c_2298 | 2021-08-01 | m | 1972-01-01 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 679106 | 1_140 | 38.56 | 1 | s_231391 | c_974 | 2022-06-30 | f | 1981-01-01 |
| 679107 | 0_1920 | 25.16 | 0 | s_334324 | c_7748 | 2023-01-30 | f | 1989-01-01 |
| 679108 | 0_1920 | 25.16 | 0 | s_20115 | c_7088 | 2021-04-13 | m | 1987-01-01 |
| 679109 | 0_1920 | 25.16 | 0 | s_41465 | c_7748 | 2021-05-30 | f | 1989-01-01 |
| 679110 | 0_1920 | 25.16 | 0 | s_319303 | c_7748 | 2022-12-30 | f | 1989-01-01 |
679111 rows × 8 columns
LP.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 679111 entries, 0 to 679110 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id_prod 679111 non-null object 1 price 679111 non-null float64 2 categ 679111 non-null object 3 session_id 679111 non-null object 4 client_id 679111 non-null object 5 Date 679111 non-null datetime64[ns] 6 sex 679111 non-null object 7 birth 679111 non-null datetime64[ns] dtypes: datetime64[ns](2), float64(1), object(5) memory usage: 46.6+ MB
There is a drop in sales in October 2021.
#total sales per month: resample("MS") groups by month start.
#Aggregate only the numeric 'price' column — summing the whole frame would
#also try to sum object/datetime columns (error or garbage in recent pandas).
df = LP.set_index('Date').resample("MS")['price'].sum().reset_index()
#24 monthly bars; index 7 (October 2021, 8th month from March 2021) in red
colors = [A5,] * 24
colors[7]= A3
fig=px.bar(df,x='Date',y='price')
#x-axis: one tick per month
fig.update_xaxes(
tick0='2021-02-01',
dtick='M1',
tickformat="%b\n%Y",
)
#y-axis
fig.update_yaxes(
showgrid = True,
gridwidth=3,
gridcolor= 'black',
zeroline=True,
zerolinewidth=2,
zerolinecolor="grey",
title='sales amount',
titlefont= dict(family='sans-serif',
size= 20,
)
)
#title
fig.update_layout(
title = 'Total sales per month',
title_font_size= 30,
plot_bgcolor='white',
)
#changing bar colors (highlights the October 2021 drop)
fig.update_traces(marker_color=colors)
fig.show()
There are some data missing concerning cat 1 in october 2021
sns.heatmap(pd.pivot_table(LP, index='Date', columns='categ', values='price', aggfunc=np.sum)[pd.pivot_table(LP, index='Date', columns='categ', values='price', aggfunc=np.sum).columns].isnull(), cmap=sns.color_palette([A5,A3]))
<AxesSubplot:xlabel='categ', ylabel='Date'>
For 26 days (from 2021-10-02 to 2021-10-27) there are no records of sales of cat 1 products.
pd.pivot_table(LP.query('Date >"2021-09-30"& Date<"2021-11-01"'), index='Date', columns='categ', values='price', aggfunc=np.sum)
| categ | 0 | 1 | 2 |
|---|---|---|---|
| Date | |||
| 2021-10-01 | 6950.50 | 7003.79 | 3104.05 |
| 2021-10-02 | 7141.01 | NaN | 2041.12 |
| 2021-10-03 | 6786.57 | NaN | 2206.48 |
| 2021-10-04 | 6533.60 | NaN | 2746.08 |
| 2021-10-05 | 6357.91 | NaN | 3032.55 |
| 2021-10-06 | 7546.58 | NaN | 1944.11 |
| 2021-10-07 | 6404.01 | NaN | 1787.07 |
| 2021-10-08 | 7069.53 | NaN | 3137.82 |
| 2021-10-09 | 6808.69 | NaN | 2616.67 |
| 2021-10-10 | 6490.98 | NaN | 2334.67 |
| 2021-10-11 | 7005.40 | NaN | 3225.16 |
| 2021-10-12 | 6706.97 | NaN | 2264.18 |
| 2021-10-13 | 6760.40 | NaN | 2666.82 |
| 2021-10-14 | 6422.39 | NaN | 3047.39 |
| 2021-10-15 | 6697.08 | NaN | 2701.75 |
| 2021-10-16 | 6898.14 | NaN | 3330.88 |
| 2021-10-17 | 6492.60 | NaN | 3065.38 |
| 2021-10-18 | 6376.43 | NaN | 2707.29 |
| 2021-10-19 | 5785.71 | NaN | 2816.00 |
| 2021-10-20 | 5902.77 | NaN | 3118.42 |
| 2021-10-21 | 6438.85 | NaN | 3132.94 |
| 2021-10-22 | 6051.86 | NaN | 2416.00 |
| 2021-10-23 | 5704.26 | NaN | 3206.38 |
| 2021-10-24 | 6036.65 | NaN | 3886.54 |
| 2021-10-25 | 5817.82 | NaN | 2258.22 |
| 2021-10-26 | 6335.01 | NaN | 2685.78 |
| 2021-10-27 | 5822.24 | NaN | 3967.29 |
| 2021-10-28 | 5837.61 | 6317.99 | 2602.82 |
| 2021-10-29 | 6410.57 | 6425.18 | 3617.88 |
| 2021-10-30 | 5734.62 | 6753.69 | 2898.47 |
| 2021-10-31 | 5924.07 | 7261.67 | 3219.38 |
In order to deal with those missing values, we replace the NaN by the mean to get an estimate of the actual value of October 2021 sales.
#impute the missing cat-1 days with each category's daily mean, then
#aggregate to monthly totals. The original built the identical pivot table
#twice (once for fillna's target, once for the means) — compute it once.
daily_sales = pd.pivot_table(data=LP, index='Date', columns='categ', values='price', aggfunc=np.sum)
sales_per_cat = daily_sales.fillna(daily_sales.mean()).resample("MS").sum()
Sales are decreasing in February (-12% compared to January), which burdens the upward sales tendency (+5% on the first 11 months), leading to an almost stable revenue from Year 1 (March 2021-February 2022) to Year 2 (March 2022-February 2023), around 6 M€ (+0.3%).
#yearly revenue: year 1 = Mar 2021-Feb 2022, year 2 = Mar 2022-Feb 2023
#(month-start index, so each strict/loose bound selects exactly 12 months)
CA2 = round(sales_per_cat.query("Date>'2022-02-01'").sum().sum(), 0)
CA1 = round(sales_per_cat.query("Date<'2022-03-01'").sum().sum(), 0)
print("In year 1, total sales were {:,} €".format(CA1))
print("In year 2, total sales were {:,} €".format(CA2))
print("Total sales increased of {}%.".format(round((CA2-CA1)/CA1*100,1)))
In year 1, total sales were 6,003,607.0 € In year 2, total sales were 6,021,992.0 € Total sales increased of 0.3%.
#month-over-month variation: February 2023 vs January 2023.
#The original computed the January sum twice (with two different-but-
#equivalent date literals) and printed "decreased of -11.76%" — a double
#negative. Compute each month once and report a signed change instead.
jan_2023 = LP.query('Date<"2023-02-01" & Date >"2022-12-31"').price.sum()
feb_2023 = LP.query('Date<"2023-03-01" & Date >"2023-01-31"').price.sum()
x = 100 * (feb_2023 - jan_2023) / jan_2023
print("Between January 2023 and February 2023, total sales changed by {:.2f}%".format(x))
Between february 2023 and january 2023, the total sales variation decreased of -11.76%
#year-over-year comparison on the first eleven months (March through January)
eleven_m_y2 = LP.query("Date>'2022-02-28' & Date <'2023-02-01'").price.sum()
eleven_m_y1 = LP.query("Date<'2022-02-01'").price.sum()
x = 100 * (eleven_m_y2 - eleven_m_y1) / eleven_m_y1
print("On the first eleven month, sales were up {}%".format(round(x,1)))
On the first eleven month, sales were up 5.1%
del x
#stacked bars: monthly sales split by category (columns '0','1','2')
colors=[A2,A4,A3]
fig=px.bar(sales_per_cat, x=sales_per_cat.index,y=['0','1','2'], color_discrete_sequence=colors)
#X axis: one tick per month
fig.update_xaxes(
tick0='2021-02-01',
dtick='M1',
tickformat="%b\n%Y")
#title
fig.update_layout(
title = 'Total sales per category per month',
title_font_size= 30,
plot_bgcolor='white',
)
#y-axis
fig.update_yaxes(
showgrid = True,
gridwidth=3,
gridcolor= 'black',
zeroline=True,
zerolinewidth=2,
zerolinecolor="grey",
title='sales amount',
titlefont= dict(family='sans-serif',
size= 20,
)
)
fig.show()
For a better view of sales per category variation over time, we create a base index 100 on march 2022.
#Index each category on its March 2022 value (base 100) so trends are
#comparable; the original repeated the same three lines per category.
for cat in ['0', '1', '2']:
    base = sales_per_cat[cat].loc['2022-03-01']
    sales_per_cat['base_' + cat] = round(100 * sales_per_cat[cat] / base, 2)
Sales amounts are steadier this last year compared to what occurred the year before. The decrease in sales in February is due to a decrease in the sales of all 3 categories, but with a better resilience of Cat 2.
#indexed view (base 100 = March 2022) of monthly sales per category
fig = go.Figure()
#reference line at the base level
fig.add_hline(y=100, line_dash='dot',line_color='black',line_width=5)
fig.add_trace(go.Bar(x=sales_per_cat.index,
    y=sales_per_cat.base_0,
    name='Cat 0 ',
    marker_color=A2
))
fig.add_trace(go.Bar(x=sales_per_cat.index,
    y=sales_per_cat.base_1,
    name='Cat 1',
    marker_color=A4
))
fig.add_trace(go.Bar(x=sales_per_cat.index,
    y=sales_per_cat.base_2,
    name='Cat 2',
    marker_color=A3
))
#shade the base month for readability
fig.add_vrect(
    x0="2022-02-15", x1="2022-03-15",
    fillcolor="gainsboro", opacity=0.6,
    layer="below", line_width=0,
)
#X axis
fig.update_xaxes(
    tick0='2021-02-01',
    dtick='M1',
    tickformat="%b\n%Y")
#title
fig.update_layout(
    title = 'sales amount base 100 on march 2022',
    title_font_size= 30,
    plot_bgcolor='white',
)
#y-axis
fig.update_yaxes(
    showgrid = True,
    gridwidth=3,
    gridcolor= 'black',
    zeroline=True,
    zerolinewidth=2,
    zerolinecolor="grey",
    title='sales amount',
    titlefont= dict(family='sans-serif',
        size= 20,
    )
)
#FIX: this figure was configured but never rendered — every sibling chart
#in the notebook ends with fig.show()
fig.show()
From a yearly point of view, revenue is stable from Year 1 (March 2021-February 2022) to Year 2 (March 2022-February 2023), around 6 M€ (+0.3%). The share of each category is quite stable too (slight increase of cat 2, slight decrease of cat 0).
#Creating 2 query strings to filter per year
#year 1 = March 2021-February 2022 ; year 2 = March 2022-February 2023
#(month-start index: each bound selects exactly 12 whole months)
year1="Date<'2022-03-01'"
year2="Date>'2022-02-01'"
#year-1 monthly totals, without the base_* helper columns added above
sales_per_cat.drop(['base_2','base_1','base_0'], axis=1).query(year1)
| categ | 0 | 1 | 2 |
|---|---|---|---|
| Date | |||
| 2021-03-01 | 193629.17 | 186974.170000 | 101837.27 |
| 2021-04-01 | 205222.46 | 156138.350000 | 114748.49 |
| 2021-05-01 | 196186.72 | 165893.400000 | 130863.35 |
| 2021-06-01 | 167943.15 | 189162.040000 | 126983.37 |
| 2021-07-01 | 144750.79 | 188523.270000 | 149561.34 |
| 2021-08-01 | 167737.62 | 162991.380000 | 151555.79 |
| 2021-09-01 | 246353.91 | 190613.780000 | 70272.99 |
| 2021-10-01 | 199250.83 | 205632.760256 | 87785.59 |
| 2021-11-01 | 155909.56 | 252910.390000 | 107347.78 |
| 2021-12-01 | 206036.24 | 251026.750000 | 68854.29 |
| 2022-01-01 | 164210.51 | 256267.920000 | 104860.56 |
| 2022-02-01 | 183197.33 | 213120.640000 | 139253.53 |
colors=[A2,A4,A3]
#two donut charts: category share of total sales, one per year
fig = make_subplots(rows=1, cols=2, specs=[
    [{'type': 'domain'}, {'type': 'domain'}]])
#restrict the sums to the raw category columns: sales_per_cat also carries
#the base_0/base_1/base_2 index columns added earlier, which would otherwise
#feed 6 values into a 3-label pie
fig.add_trace(go.Pie(labels=['Cat 0', 'Cat 1', 'Cat 2'], values=np.sum(
    sales_per_cat.query(year1)[['0', '1', '2']]), name="Year 1"), row=1, col=1)
fig.add_trace(go.Pie(labels=['Cat 0', 'Cat 1', 'Cat 2'], values=np.sum(
    sales_per_cat.query(year2)[['0', '1', '2']]), name="Year 2"), row=1, col=2)
fig.update_traces(hole=.4, hoverinfo="label+percent+name",
    marker=dict(colors=colors, line=dict(color='#000000', width=2)))
fig.update_layout(
    title_text="Sales per category ",
    annotations=[dict(text='Year 1', x=0.18, y=0.5, font_size=20, showarrow=False),
        dict(text='Year 2', x=0.82, y=0.5, font_size=20, showarrow=False)])
fig.show()
#day of week as an int (0=Monday ... 6=Sunday)
LP['day']=LP.Date.dt.dayofweek
#total sales per weekday over year 2
pd.pivot_table(LP.query(year2), index='day', values='price', aggfunc='sum')
| price | |
|---|---|
| day | |
| 0 | 942449.38 |
| 1 | 938768.96 |
| 2 | 940698.59 |
| 3 | 934602.79 |
| 4 | 916850.07 |
| 5 | 927609.31 |
| 6 | 939431.71 |
#sales per weekday (year 2); compute the pivot once instead of twice
daily_totals = pd.pivot_table(LP.query(year2), index='day', values='price', aggfunc='sum')
fig = px.line(x=daily_totals.index, y=daily_totals.price)
# x-axis: replace the 0-6 weekday codes with readable labels
fig.update_xaxes(
    ticktext=["Mon","Tues",'Wed','Thur','Fri','Sat','Sun'],
    tickvals=["0","1","2","3","4","5","6"],
    title= None,
)
# y-axis
fig.update_yaxes(
    showgrid=True,
    gridwidth=3,
    gridcolor='black',
    zeroline=True,
    zerolinewidth=2,
    zerolinecolor="grey",
    title='sales amount',
    titlefont=dict(family='sans-serif',
        size=20,
    )
)
# title
fig.update_layout(
    title='Sales per day (year 2)',
    title_font_size=30,
    plot_bgcolor='white',
)
fig.show()
There is only one client per session, but there are 1,587 sessions with 2 dates (0.5% of session dates - representing 0.7% of total sales). We need to get rid of those to go on with the session analysis.
#per-session cardinality of each key column: a session should map to one
#client and one date (id_prod/categ naturally vary within a basket)
for column in ['id_prod', 'categ', 'client_id', 'Date']:
    print(column)
    print(LP.groupby('session_id')[column].agg(['nunique']).value_counts())
id_prod nunique 1 163529 2 90631 3 47402 4 23541 5 10164 6 4226 7 1828 8 598 9 247 10 74 11 23 12 6 13 5 14 1 dtype: int64 categ nunique 1 251265 2 89056 3 1954 dtype: int64 client_id nunique 1 342275 dtype: int64 Date nunique 1 340688 2 1587 dtype: int64
LP_passage=LP.groupby('session_id').nunique().reset_index()
#sessions spanning two dates are ambiguous — collect their ids.
#(the original named this variable `list`, shadowing the builtin)
conflicting_sessions = LP_passage[LP_passage.Date > 1].session_id
del LP_passage
LP2 = LP[~LP.session_id.isin(conflicting_sessions)]
print("the new df represents {:.2f}% of total sales amount".format(np.sum(LP2.price)/np.sum(LP.price)*100))
the new df represents 99.28% of total sales amount
Now that we have gotten rid of the sessions with date conflicts, we can create a new df named Session.
Session=LP2.groupby(['session_id','Date','client_id','sex','birth']).agg({'price': sum, 'id_prod':'count'}).reset_index()
We create an age category so we can see how our different customers segments spend on our website.
def _age_range(year):
    """Map a birth year to the age-range label used in the analysis."""
    if year > 2003:
        return '[0-20['
    if year > 1983:
        return '[20-40['
    if year > 1953:
        return '[40-70['
    return '>70'

#label each session with the buyer's age range (replaces the manual
#list-append loop of the original with a mapped helper)
Session['Age_range'] = Session.birth.dt.year.map(_age_range)
T=Session.query(year2).set_index("Date").resample("MS").agg({'session_id':'count','client_id':'nunique','price':'sum','id_prod':'sum'}).reset_index()
The decrease that we observe in February is due to a smaller number of sessions (-11.4%) and of unique clients (-4%), as the average session basket is remarkably stable this year.
#three stacked subplots: session count, distinct clients, average spending
#(note: this `colors` list is unused here — each trace sets its own color)
colors=[A2,A4,A3]
fig = make_subplots(rows=3, cols=1,subplot_titles=("NB Sessions","NB clients", "Average customer spending (€)"),shared_xaxes=True,vertical_spacing=0.06)
fig.append_trace(go.Bar(x=T.Date, y=T.session_id, marker_color= A3, name='sessions'), row=1, col=1)
fig.append_trace(go.Bar(x=T.Date, y=T.client_id, marker_color= A2, name='clients'), row=2, col=1)
#average basket = monthly sales divided by number of sessions
fig.append_trace(go.Bar(x=T.Date, y=(T.price/T.session_id), marker_color= A5,name='customer spending'), row=3, col=1)
#x-axis
fig.update_xaxes(tick0='2021-02-01',dtick='M1',tickformat="%b\n%Y",)
#y-axis
fig.update_yaxes(
showgrid = True,
gridwidth=3,
gridcolor= 'black',
zeroline=True,
zerolinewidth=2,
zerolinecolor="grey",
)
#title
fig.update_layout(
title = 'Sessions over time',
title_font_size= 30,
plot_bgcolor='white',
)
fig.show()
#free the temporary KPI frame
del T
The younger the customer, the higher the customer's basket.
But under-40s represent less than a third of sessions.
We cannot see a significant difference between women and men in the purchase profiles within age groups.
#session spending distribution per age group (box plot, mean displayed)
fig = go.Figure()
fig.add_trace(go.Box(
    x=Session.sort_values(by='birth', ascending=False).Age_range,
    y=Session.sort_values(by='birth', ascending=False).price,
    name='Session amount',
    marker_color=A5,
    boxmean=True,
))
#y axis
fig.update_yaxes(
    showgrid = True,
    gridwidth=3,
    gridcolor= 'black',
    zeroline=True,
    zerolinewidth=2,
    zerolinecolor="grey",
    title="Amount",
)
#title
fig.update_layout(
    title = 'customers spending per session according to age group ',
    title_font_size= 30,
    plot_bgcolor='white',
)
#FIX: the first figure was built but never displayed before `fig` was
#overwritten by the sex-split box plot below
fig.show()
#same distribution, additionally split by sex within each age group
fig = px.box(Session.sort_values(by='birth', ascending=False), x="Age_range", y="price", color="sex")
fig.show()
#monthly session counts per age range (year 2)
T=pd.pivot_table(Session.query(year2), index='Date',columns='Age_range', values='session_id', aggfunc='count').resample('MS').agg(sum)
#add a total column (last position) so percConvert can divide by the row total
T['Tot']=T['>70']+T['[0-20[']+T['[20-40[']+T['[40-70[']
#share of sessions per age range, per month (rows sum to 100)
T.apply(percConvert, axis=1)
| Age_range | >70 | [0-20[ | [20-40[ | [40-70[ | Tot |
|---|---|---|---|---|---|
| Date | |||||
| 2022-02-01 | 8.11 | 2.63 | 30.04 | 59.23 | 100.0 |
| 2022-03-01 | 7.66 | 2.43 | 29.23 | 60.68 | 100.0 |
| 2022-04-01 | 7.70 | 2.77 | 30.28 | 59.26 | 100.0 |
| 2022-05-01 | 7.98 | 2.31 | 30.13 | 59.58 | 100.0 |
| 2022-06-01 | 8.12 | 2.55 | 29.17 | 60.16 | 100.0 |
| 2022-07-01 | 7.60 | 2.53 | 29.68 | 60.19 | 100.0 |
| 2022-08-01 | 7.90 | 2.62 | 29.87 | 59.62 | 100.0 |
| 2022-09-01 | 7.91 | 2.38 | 30.13 | 59.58 | 100.0 |
| 2022-10-01 | 7.44 | 2.46 | 29.83 | 60.26 | 100.0 |
| 2022-11-01 | 7.71 | 2.62 | 29.68 | 59.99 | 100.0 |
| 2022-12-01 | 7.61 | 2.54 | 30.41 | 59.44 | 100.0 |
| 2023-01-01 | 8.17 | 2.70 | 29.75 | 59.38 | 100.0 |
| 2023-02-01 | 7.79 | 2.69 | 29.86 | 59.66 | 100.0 |
#overall (whole year 2) share of sessions for each age range
print("year 2 :sessions per age group")
for age_group in T.columns:
    share = round(100 * np.sum(T[age_group]) / len(Session.query(year2)), 2)
    print("{}-{}%".format(age_group, share))
year 2 :sessions per age group >70-7.82% [0-20[-2.55% [20-40[-29.85% [40-70[-59.77% Tot-100.0%